I had to do a regression — a perfect opportunity to use Scikit-learn. This was largely inspired by this more complete example.
In [ ]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [17]:
# Raw benchmark dump: "<N> <seconds>" pairs separated by "<br>" tags.
a = "32 0.000<br> 64 0.002<br> 128 0.016<br> 256 0.212<br> 512 2.102<br> 1024 22.744<br> 2048 248.011<br> 4096 2670.298"
tab_N = []
tab_time = []
# Raw string + escaped dot: the original '\d+.\d+' let '.' match ANY
# character, and the bare '\s'/'\d' escapes in a non-raw string are
# deprecated (and eventually an error) in Python 3.
p = re.compile(r'\s*(\d+)\s+(\d+\.\d+)')
for i in a.split("<br>"):
    m = p.match(i)
    tab_N.append(int(m.group(1)))
    tab_time.append(float(m.group(2)))
# Drop the first point (time 0.000): log2(0) is -inf and would break the fit.
s_N = pd.Series(tab_N[1:])
s_time = pd.Series(tab_time[1:])
plt.scatter(s_N, s_time)
# Data appears to be exponential... let's work in log-log space so a
# straight-line fit recovers the exponent.
s_log_N = np.log2(s_N)
s_log_time = np.log2(s_time)
In [36]:
df = pd.DataFrame({'N': s_log_N, 'time': s_log_time})
# Fix random_state so the split (and the "red test points" discussed
# below) is reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(df[["N"]], df[["time"]], random_state=0)
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
# coef_ is 2-D here (one target, one feature): index it explicitly
# instead of relying on implicit array->scalar conversion, and use the
# print() function for Python 3 (the last cell already uses it).
print("The slope is : %0.2f" % regr.coef_[0][0])
predicted_all = regr.predict(df[["N"]])
predicted_train = regr.predict(X_train)
predicted_test = regr.predict(X_test)
# Plot the data, the fitted line, and highlight the held-out test points.
plt.scatter(s_log_N, s_log_time, color='yellow', alpha=0.6)
plt.plot(df[["N"]], predicted_all, c='g', alpha=0.5)
# The "test sample", i.e. the 2 points in red, lands quite close to the prediction.
plt.scatter(X_test, y_test, s=60, c='r', alpha=0.9)
Out[36]:
In [45]:
# Relative prediction error (%) for each split; compute once and reuse.
# Go through .values so the result stays a plain ndarray — subtracting a
# DataFrame from an ndarray yields a DataFrame, and iterating a DataFrame
# produces column labels, which would break the x[0] formatting below.
err_test = (predicted_test - y_test.values) / y_test.values * 100
err_train = (predicted_train - y_train.values) / y_train.values * 100
plt.scatter(predicted_test, err_test, c='g', s=40)
plt.scatter(predicted_train, err_train, c='b', s=40, alpha=0.5)
print("The test sample error is " + ", ".join(["%0.2f%%" % x[0] for x in err_test]))